---
title: "Journal Data Policy"
author: "Sebastian Karcher"
date: "June 29, 2019"
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
# Chunk defaults: show the code in the rendered document, hide warnings.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)

# Packages used by the chunks below.
library(tidyr) # drop_na()
library(dplyr) # distinct(), filter(), select(), recode_factor(), %>%
library(ggplot2) # all figures
library(extrafont) # NOTE(review): presumably provides the "Open Sans" family used in the plots -- confirm
library(margins) # margins() and cplot() for the logit models
library(here) # here() to locate the data file
library(memisc) # data.set / codebook tools; attached last, so it can mask earlier packages
```

### Reading in and preparing the data

```{r data}
# Load the tab-separated journal policy data; here() resolves the file
# relative to the project root.
jpolicy <- read.csv(
  file = here("journal_data_policy_20190629.tsv"),
  sep = "\t",
  encoding = "UTF-8"
)

# Free-text columns are stored as plain character vectors
# (read.csv may deliver them as factors, depending on the R version).
text_columns <- c("title", "ISSN", "policy_text", "policy_link", "nomenclature")
for (column in text_columns) {
  jpolicy[[column]] <- as.character(jpolicy[[column]])
}

```

### Generate Codebook

```{r generate codebook}
# Build a memisc data.set from the raw (still numerically coded) data, attach
# a short description, value labels and the full coding instruction ("wording")
# to every variable, and write the resulting codebook to a dated text file.
library(memisc)
codebook_data <- as.data.set(jpolicy)
codebook_data <- within(codebook_data, {
  # Short variable descriptions
  description(discipline) <- "Discipline"
  description(ranking) <- "Rank within Discipline"
  description(title) <- "Journal title"
  description(publisher) <- "Journal publisher"
  description(IF) <- "Journal impact factor"
  description(ISSN) <- "Journal ISSN"
  description(language) <- "Journal publication language"
  description(frequency) <- "Publication frequency"
  description(year) <- "Year of first publication"
  description(policy_text) <- "Excerpt from data policy"
  description(policy_link) <- "Link to data policy"
  description(DART) <- "Journal has signed JETS"
  description(TOPS) <- "Journal has signed TOP"
  description(has_policy) <- "Journal has data policy"
  description(source) <- "Source of data policy"
  description(strictness) <- "Strictness of data policy"
  description(qual_data) <- "Mention of qualitative data in policy"
  description(data_location) <- "Location for data sharing"
  description(submission_time) <- "Timing of data submission"
  description(publication_time) <- "Timing of data publication"
  description(nomenclature) <- "Nomenclature used for data"
  description(codebooks) <- "Mention of codebooks"
  description(scripts) <- "Mention of analysis scripts"
  description(benefits) <- "Mention of benefits of data sharing"
  description(citation) <- "Data citation instructions"
  description(placement) <- "Data availability in journal"
  description(legality) <- "Ethical/legal exemptions"
  description(consequences) <- "Consequence for non-compliance"

  # memisc's foreach() repeats the braced expression once per listed variable,
  # marking each coded variable as nominal.
  # NOTE(review): `citation` receives value labels below but is not in this
  # list -- confirm whether it should be declared nominal as well.
  foreach(x = c(DART, TOPS, has_policy, source, strictness, qual_data, data_location, submission_time, publication_time, codebooks, scripts, benefits, placement, legality, consequences), {
    measurement(x) <- "nominal"
  })

  # Value labels for the numeric codes
  labels(DART) <- c("No" = 0, "Yes" = 1)
  labels(TOPS) <- c("No" = 0, "Yes" = 1)
  labels(has_policy) <- c("No" = 0, "Yes" = 1)
  labels(source) <- c("No policy" = 0, "Journal" = 1, "Publisher" = 2, "Association" = 3)
  labels(strictness) <- c("No policy" = 0, "Encourage" = 1, "Require" = 2)
  labels(qual_data) <- c("No mention" = 0, "Implicit" = 1, "Explicit" = 2)
  labels(data_location) <- c("No policy" = 0, "On request" = 1, "No location specified" = 2, "Author website" = 3, "Journal" = 4, "A repository" = 5, "Specific repository" = 6)
  labels(submission_time) <- c("No data policy" = 0, "No timing specified" = 1, "After publication" = 2, "After embargo" = 3, "On publication" = 4, "On acceptance" = 5, "On submission" = 6, "Before submission" = 7)
  labels(publication_time) <- c("No data policy" = 0, "No timing specified" = 1, "After embargo" = 2, "After publication" = 3, "Explicit on publication" = 4)
  labels(codebooks) <- c("No mention" = 0, "Mentioned" = 1, "Required" = 2)
  labels(scripts) <- c("No mention" = 0, "Mentioned" = 1, "Required" = 2)
  labels(benefits) <- c("No" = 0, "Yes" = 1)
  labels(citation) <- c("No mention" = 0, "Mentions citing" = 1, "Specific Template" = 2)
  labels(placement) <- c("No" = 0, "Yes" = 1)
  labels(legality) <- c("No" = 0, "Yes" = 1)
  labels(consequences) <- c("No" = 0, "Yes" = 1)

  # Full coding instructions / question wording
  wording(discipline) <- "Discipline the journal is listed under in the Journal Citation Report"
  wording(ranking) <- "Ranking of the journal by impact factor within its discipline in the Journal Citation Report"
  wording(publisher) <- "The journal's publisher. Where a journal is published by a publisher 'on behalf of' a professional organization, we list the publisher, not the association"
  wording(IF) <- "The journal's 2016 Thomson Reuters (now Clarivate) Impact Factor"
  wording(ISSN) <- "The journal's International Standard Serial Number (ISSN)"
  wording(language) <- "In which language(s) does the journal currently publish articles?"
  wording(frequency) <- "How many issues does the journal publish per year?"
  wording(year) <- "In which year was the journal first published?"
  wording(policy_text) <- "Relevant passages from the data policy unless too long"
  wording(DART) <- "Has the journal signed onto the DA-RT Journal Editors' Transparency Statement? (political science only, by time of data collection fall 2017)"
  wording(TOPS) <- "Has the journal signed onto the TOP guidelines? (by time of data collection fall 2017)"
  wording(has_policy) <- "Does the journal have a data policy? Any mention of depositing data in the author guidelines or directly linked from the author guidelines is coded as 'Yes'"
  wording(source) <- "What is the source of the data policy? Defaults to journal. Associations are coded by explicit mention. Publishers are coded based on links or common text blocks between different journals of the same publisher"
  wording(strictness) <- "Does the journal require data sharing or just mention/encourage it? Any mention or clear implication that authors 'are expected to,' 'have to,' 'are required,' 'must,' etc. share data is coded as 'Require'"
  wording(qual_data) <- "Does the data policy mention qualitative data? Implicit mention includes inclusion of typically qualitative materials such as transcripts, field notes, or images in the data policy"
  wording(data_location) <- "Where should shared data be deposited? Codes preferred option where multiple are given"
  wording(submission_time) <- "When should authors submit the data in relation to the article they accompany?"
  wording(publication_time) <- "When are data published in relation to the publication time of the article they accompany?"
  wording(nomenclature) <- "What language is used to describe data and other materials?"
  wording(codebooks) <- "Does the data policy mention or require the deposit of codebooks?"
  wording(scripts) <- "Does the data policy mention or require the deposit of analysis scripts? Any mention of computer code, statistical code, replication code and similar was coded as pertaining to analysis scripts"
  wording(benefits) <- "Does the data policy mention any benefits of sharing data?"
  wording(citation) <- "Does the journal specify how data should be cited? (Can refer to either authors' own or secondary data)"
  wording(placement) <- "Does the journal specify that replication data should be mentioned in the manuscript? Originally included multiple coding options but collapsed to yes/no based on coder feedback"
  wording(legality) <- "Does the policy mention any exception or allowance for legal concerns related to data sharing (e.g., proprietary data, human participants data, copyright)?"
  wording(consequences) <- "Does the journal specify consequences for not sharing data? (e.g., article will not be published, editors' statement of concerns, etc.)"
})

# The codebook is written to the working directory as codebook_<today>.txt
filename_codebook <- paste0("codebook_", Sys.Date(), ".txt")
policy_codebook <- codebook(codebook_data)
Write(policy_codebook, file = filename_codebook)
```




### Recode data and generate new variables
```{r recode data}


# Turn the numerically coded categorical variables into labelled factors.
# Every call names its levels explicitly (codes as listed in the codebook
# chunk). Without `levels`, factor() pairs the labels with whatever distinct
# values occur in the data, so a code that happens to be absent would shift
# the labels onto the wrong codes or make the call fail.
# Values outside the listed codes become NA.
jpolicy$DART <- factor(jpolicy$DART, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$TOPS <- factor(jpolicy$TOPS, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$has_policy <- factor(jpolicy$has_policy, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$source <- factor(jpolicy$source, levels = 0:3, labels = c("No policy", "Journal", "Publisher", "Association"))
jpolicy$strictness <- factor(jpolicy$strictness, levels = 0:2, labels = c("No policy", "Encourage", "Require"))
jpolicy$qual_data <- factor(jpolicy$qual_data, levels = 0:2, labels = c("No mention", "Implicit", "Explicit"))
jpolicy$codebooks <- factor(jpolicy$codebooks, levels = 0:2, labels = c("No mention", "Mentioned", "Required"))
jpolicy$benefits <- factor(jpolicy$benefits, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$citation <- factor(jpolicy$citation, levels = 0:2, labels = c("No mention", "Mentions citing", "Specific Template"))
jpolicy$legality <- factor(jpolicy$legality, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$consequences <- factor(jpolicy$consequences, levels = c(0, 1), labels = c("No", "Yes"))
jpolicy$scripts <- factor(jpolicy$scripts, levels = 0:2, labels = c("No mention", "Mentioned", "Required"))

jpolicy$publication_time <- factor(jpolicy$publication_time, levels = c("0", "1", "2", "3", "4"), labels = c("No data policy", "No timing specified", "After embargo", "After publication", "Explicit on publication"))

jpolicy$submission_time <- factor(jpolicy$submission_time, levels = c("0", "1", "2", "3", "4", "5", "6", "7"), labels = c("No data policy", "No timing specified", "After publication", "After embargo", "On publication", "On acceptance", "On submission", "Before submission"))

jpolicy$placement <- factor(jpolicy$placement, levels = c(0, 1), labels = c("No", "Yes"))


# Generate new variables
# Journal age in years, relative to 2017
jpolicy$age <- 2017 - jpolicy$year

## Consolidating the location variable
## The rationale for this is explained below, but let's do all recoding here.
## This has to run before data_location itself is turned into a factor,
## because the recode matches on the original numeric codes.
jpolicy$data_location_simple <- dplyr::recode_factor(jpolicy$data_location, `0` = "No data policy", `1` = "Other", `2` = "Other", `3` = "Other", `4` = "Journal", `5` = "Repository", `6` = "Repository")

jpolicy$data_location <- factor(jpolicy$data_location, levels = c("0", "1", "2", "3", "4", "5", "6"), labels = c("No policy", "On request", "No location specified", "Author website", "Journal", "A repository", "Specific repository"))

## Create a binary strictness variable for the with-policy data:
## "Require" versus everything else
jpolicy$bin_strictness <- factor(ifelse(jpolicy$strictness == "Require", "Require", "Don't require"))


# List the titles that occur more than once, then keep the first row per title
jpolicy$title[duplicated(jpolicy$title)]
jpolicy_dedup <- distinct(jpolicy, title, .keep_all = TRUE)

# Create a subset with only journals that have a policy
jpolicy_with_policy <- subset(jpolicy, has_policy == "Yes")
# And one without the duplicates
jpolicy_with_policy_dedup <- subset(jpolicy_dedup, has_policy == "Yes")


```



### Policy by discipline
This generates *figure 1* and the total number of journals with a policy (the latter taken without duplicates)

```{r Policy by discipline}
# Number of unique journals with and without a data policy
table(dplyr::select(jpolicy_dedup, has_policy))

# Figure 1: share of journals with / without a policy, one panel per discipline.
# `group` is set to a constant so that `..prop..` is computed across the bars
# of a panel instead of separately for each bar.
policyplot <- ggplot(jpolicy, aes(x = has_policy, group = "discipline")) +
  geom_bar(
    aes(y = ..prop.., fill = factor(..x..)),
    stat = "count"
  ) +
  geom_text(
    aes(
      y = ..prop..,
      label = scales::percent_format(accuracy = 1)(..prop..)
    ),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ discipline, nrow = 2) +
  scale_fill_manual(values = c("red", "dodgerblue")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  guides(fill = FALSE) +
  labs(
    x = "Has Data Policy",
    y = "Percent",
    title = "Percentage of Journals with Data Policy by Discipline"
  ) +
  theme(
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold"),
    text = element_text(family = "Open Sans")
  )

# Show the figure in the document and save a copy to the working directory
print(policyplot)
ggsave("Has Policy By Discipline.png", plot = policyplot)
```


### Strictness by discipline
This generates *figure 2* and the number of journals with a strict policy.
Using the whole dataset for the graph but only interested in the deduped journals for the number

```{r strictness by discipline}
# Strictness counts among unique journals that have a policy
with_policy_unique <- filter(jpolicy_dedup, has_policy == "Yes")
table(dplyr::select(with_policy_unique, strictness))

# Figure 2: share of journals per strictness level, one panel per discipline
# (all rows of jpolicy, duplicates included). The constant `group` makes
# `..prop..` a share of the whole panel.
strictnessplot <- ggplot(
  jpolicy,
  aes(
    x = factor(strictness, labels = c("No policy", "Encourage", "Require")),
    group = "discipline"
  )
) +
  geom_bar(
    aes(y = ..prop.., fill = factor(..x..)),
    stat = "count"
  ) +
  geom_text(
    aes(
      y = ..prop..,
      label = scales::percent_format(accuracy = 1)(..prop..)
    ),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ discipline, nrow = 2) +
  scale_fill_manual(values = c("red", "dodgerblue", "green")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  guides(fill = FALSE) +
  labs(
    x = "Policy Strictness",
    y = "Percent",
    title = "Percentage of Journals by Strictness of Data Policy"
  ) +
  theme(
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold"),
    text = element_text(family = "Open Sans")
  )

print(strictnessplot)
ggsave("Policy Strictness by Discipline.png", plot = strictnessplot)
```

### Policy by policy source

We're running this only on deduped journals with policy. This produces *figure 3*.

```{r strictness by source}
# Figure 3: number of unique with-policy journals per strictness level,
# one panel per policy source.
sourcestrictplot <- ggplot(
  jpolicy_with_policy_dedup,
  aes(x = strictness, group = "source")
) +
  geom_bar(aes(fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ source) +
  # The axis labels are hard-coded for the two strictness levels expected
  # in the with-policy subset.
  scale_x_discrete(labels = c("Encourage", "Require")) +
  scale_fill_manual(values = c("red", "dodgerblue")) +
  guides(fill = FALSE) +
  labs(
    x = "Policy Strictness",
    y = "Number of Journals",
    title = "Strictness of Data Policy by Policy Source"
  ) +
  theme(
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold"),
    text = element_text(family = "Open Sans")
  )
print(sourcestrictplot)
ggsave("Strictness by Policysource.png", plot = sourcestrictplot)

```

### Data Policy and Journal Ranking

The regressions underlie the "Data Policy by Journal Ranking and Age" section. We only report estimated effect sizes for statistically significant effects. The predicted effects and CIs are taken from the `head()` and `tail()` of the `cplot` for each regression.


```{r policy by IF regression}
# Logit model: having a data policy as a function of discipline and impact
# factor, estimated on the deduplicated journals.
policymodel <- glm(
  has_policy ~ discipline + IF,
  family = binomial,
  data = jpolicy_dedup
)
summary(policymodel)

# Save the predicted-probability curve as a PDF. The values returned by
# cplot() are kept so they can be inspected below.
pdf("Effects of Journal Impact Factor on Data Policy.pdf", width = 11, height = 8)
margpolicyIFplot <- cplot(
  policymodel,
  x = "IF",
  main = "Predicted probability of having a data policy by Impact Factor",
  xlab = "Journal Impact Factor",
  ylab = "Pr(data policy)",
  se.type = "shade"
)
dev.off()

# First and last rows of the values returned by cplot()
head(margpolicyIFplot)
tail(margpolicyIFplot)


# Same plot again, this time drawn on the current device rather than the PDF
cplot(
  policymodel,
  x = "IF",
  main = "Predicted probability of having a data policy by Impact Factor",
  xlab = "Journal Impact Factor",
  ylab = "Pr(data policy)",
  se.type = "shade"
)


```


Let's do the same thing but with strictness. We're limiting this, again, to journals with a policy.

```{r strictness by IF}

## Logit model on the journals that have a policy (duplicates removed)

# Distribution of the binary outcome
summary(dplyr::select(jpolicy_with_policy_dedup, bin_strictness))

strictnessmodel <- glm(
  bin_strictness ~ discipline + IF,
  family = binomial,
  data = jpolicy_with_policy_dedup
)
summary(strictnessmodel)
```

The effect of IF on the strictness of data is quite weak if at all existent. Let's try the same with ranking, which measures the same thing but gives much less impact to outliers at the top.

We create the figure with both `cplot` and ggplot, but will use the ggplot version as **Figure 4a** in the paper.


```{r has policy and Ranking}
# Logit model: having a data policy as a function of discipline and rank
policyRankingmodel <- glm(
  has_policy ~ discipline + ranking,
  family = binomial,
  data = jpolicy_dedup
)
summary(policyRankingmodel)

# cplot() version, written to a PDF; the returned values feed the ggplot below
pdf("Effects of Journal Rank on Data Policy.pdf", width = 11, height = 8)
margpolicyplot <- cplot(policyRankingmodel, x = "ranking", se.type = "shade")
dev.off()

# ggplot version: predicted probability (solid) with its bounds (dashed)
margpolicyggplot <- ggplot(margpolicyplot, aes(x = xvals)) +
  geom_line(aes(y = yvals)) +
  geom_line(aes(y = upper), linetype = 2) +
  geom_line(aes(y = lower), linetype = 2) +
  geom_hline(yintercept = 0) +
  labs(
    title = "Predicted probability of having a journal policy by journal rank",
    x = "Journal rank",
    y = "Predicted probability"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid = element_blank(),
    text = element_text(family = "Open Sans")
  )
ggsave("Pr-policy by rank.png", plot = margpolicyggplot)

# First and last rows of the values returned by cplot()
head(margpolicyplot)
tail(margpolicyplot)
```
We do the same for strictness. The ggplot version of the plot is **Figure 4b** in the paper.

```{r strictness and Ranking}
# Logit model: requiring data sharing as a function of discipline and rank,
# estimated on unique journals that have a policy.
strictnessRankingmodel <- glm(
  bin_strictness ~ discipline + ranking,
  family = binomial,
  data = jpolicy_with_policy_dedup
)
summary(strictnessRankingmodel)
margstrict <- margins(strictnessRankingmodel)

# Predicted probabilities over the range of `ranking`
margstrictplot <- cplot(strictnessRankingmodel, x = "ranking", se.type = "shade")
head(margstrictplot)
tail(margstrictplot)

# ggplot version: predicted probability (solid) with its bounds (dashed)
margstrictplotgg <- ggplot(margstrictplot, aes(x = xvals)) +
  geom_line(aes(y = yvals)) +
  geom_line(aes(y = upper), linetype = 2) +
  geom_line(aes(y = lower), linetype = 2) +
  geom_hline(yintercept = 0) +
  labs(
    title = "Predicted probability of required data sharing by journal rank",
    x = "Journal ranking",
    y = "Predicted Probability"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid = element_blank(),
    text = element_text(family = "Open Sans")
  )

ggsave("Probability Strict by ranking.png", plot = margstrictplotgg)

# cplot() version, drawn again into a PNG file
png("Effects of Journal Rank on Strictness of  Data Policy.png")
margstrictplot <- cplot(strictnessRankingmodel, x = "ranking", se.type = "shade")
dev.off()
```



### Journal Age
Let's see if journal age has an impact, controlling for discipline

```{r has policy and Age}
# Logit model: having a data policy as a function of discipline and journal
# age. Stored under its own name so that it does not overwrite the ranking
# model (`policyRankingmodel`) fitted in an earlier chunk.
policyAgemodel <- glm(has_policy ~ discipline + age, data = jpolicy_dedup, family = binomial)
summary(policyAgemodel)

```
That's a no. However, there's a sizable effect of age on strictness. The ggplot version of the below is **Figure 5**

```{r strictness and Age}
# Logit model: requiring data sharing as a function of discipline and journal
# age, estimated on unique journals that have a policy.
strictnessAgemodel <- glm(bin_strictness ~ discipline + age, data = jpolicy_with_policy_dedup, family = binomial)
summary(strictnessAgemodel)

# Predicted probabilities over the range of `age`
strictnessAgePlot <- cplot(strictnessAgemodel, data = jpolicy_with_policy_dedup, x = "age", se.type = "shade")
head(strictnessAgePlot)
tail(strictnessAgePlot)

# cplot() version as a PNG. The file gets its own name: it previously reused
# the file name of the ranking figure and therefore overwrote that figure.
png("Effects of Journal Age on Strictness of Data Policy.png")
cplot(strictnessAgemodel, x = "age", se.type = "shade")
dev.off()

# ggplot version (Figure 5): predicted probability (solid) with bounds (dashed)
strictnessAgePlotgg <- ggplot(strictnessAgePlot, aes(x = xvals)) +
  geom_line(aes(y = yvals)) +
  geom_line(aes(y = upper), linetype = 2) +
  geom_line(aes(y = lower), linetype = 2) +
  geom_hline(yintercept = 0) +
  ggtitle("Predicted probability of required data sharing by journal age") +
  xlab("Journal age (in years)") + ylab("Predicted probability") +
  theme(text = element_text(family = "Open Sans"), panel.grid = element_blank(), panel.background = element_blank())

ggsave("Strictness by Age.png", plot = strictnessAgePlotgg)
```
### Data Location by Discipline
This is the graph using the full location categorization. We're collapsing categories 1-3 as there are hardly any journals in 1 and 3, making this graph unnecessarily cluttered.
```{r location by discipline}
# Journals with a policy: counts per (unconsolidated) data location,
# one panel per discipline
ggplot(jpolicy_with_policy, aes(x = data_location)) +
  geom_bar() +
  facet_wrap(~ discipline, nrow = 2)
```

We have above recoded the location variable and this code produces *figure 6* as well as the numbers given in the "Where do journals tell authors to share data?" section of the analysis. 

```{r location by discipline consolidate}

# Figure 6: number of with-policy journals per consolidated data location,
# one panel per discipline.
datalocDisciplinePlot <- ggplot(
  jpolicy_with_policy,
  aes(x = data_location_simple, group = "discipline")
) +
  geom_bar(aes(fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ discipline, nrow = 2) +
  scale_fill_manual(values = c("red", "dodgerblue", "green")) +
  guides(fill = FALSE) +
  labs(
    x = "Location",
    y = "Number of Journals",
    title = "Location of Data by Discipline"
  ) +
  theme(
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold"),
    text = element_text(family = "Open Sans")
  )
print(datalocDisciplinePlot)
ggsave("Location of Data by Discipline.png", plot = datalocDisciplinePlot)
  
```


### Data Location by policy source
This produces **figure 7**
```{r location by source}

# Figure 7: number of unique with-policy journals per consolidated data
# location, one panel per policy source.
datalocSourcePlot <- ggplot(
  jpolicy_with_policy_dedup,
  aes(x = data_location_simple, group = "source")
) +
  geom_bar(aes(fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ source) +
  scale_fill_manual(values = c("red", "dodgerblue", "green")) +
  guides(fill = FALSE) +
  labs(
    x = "Location",
    y = "Number of Journals",
    title = "Location of Data by Policy Source"
  ) +
  theme(
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold"),
    text = element_text(family = "Open Sans")
  )

print(datalocSourcePlot)
ggsave("Location of Data by Policy Source.png", plot = datalocSourcePlot)

```

### Data Strictness by TOPS
Let's first look at whether TOP signers are more likely to have a policy. This and the following tables produce the numbers reported in the section "Effect of TOP and DA-RT on data policies"
```{r has policy by TOPS}
# Cross-tabulate TOP signatory status (rows) against having a data policy
# (columns) for the deduplicated journals. Both columns come from
# jpolicy_dedup, so the deduplication is not repeated inline.
table(jpolicy_dedup$TOPS, jpolicy_dedup$has_policy)
```

Among those journals with a data policy, does TOPS correlate with a stronger policy? We're using the binary variable we generated above
```{r strictness  by TOPS}
# TOP signatory status (rows) by binary strictness (columns) among unique
# journals with a policy; missing values are left out of the table
table(
  jpolicy_with_policy_dedup[["TOPS"]],
  jpolicy_with_policy_dedup[["bin_strictness"]],
  useNA = "no"
)
```
We did check TOP journals by discipline to see if this was driving the results. It is not and we don't report this in the paper
```{r TOPS by discipline}
# Counts of TOP signatory status, one panel per discipline (all rows)
ggplot(jpolicy, aes(x = TOPS)) +
  geom_bar() +
  facet_wrap(~ discipline, nrow = 2)
```



### DA-RT

Let's see if DA-RT also affects the strictness of policy among journals who have one at all:
```{r strictness by DART}
# DA-RT status (rows) by binary strictness (columns) among journals with a
# policy; missing values are left out of the table
table(
  jpolicy_with_policy[["DART"]],
  jpolicy_with_policy[["bin_strictness"]],
  useNA = "no"
)

# Same comparison as bar charts, one panel per DA-RT status; rows without a
# DART value are dropped first
ggplot(drop_na(jpolicy_with_policy, DART), aes(x = bin_strictness)) +
  geom_bar() +
  facet_wrap(~ DART, nrow = 2)
```

Here are the tables for JETS/DA-RT. We're running this on the whole dataset since DART is only coded for polisci anyway, so there are no duplicates.

```{r has policy by DART}
# DA-RT status (rows) by having a data policy (columns), all rows of jpolicy;
# missing values are left out of the table
table(jpolicy[["DART"]], jpolicy[["has_policy"]], useNA = "no")
```


### Qualitative Data by Discipline
Finally, we're interested to see how many journals talk about qualitative data. We're only looking at the journals that have any policy. The below table and graph are the source for the numbers in the section "Data policies and qualitative data"

```{r Qualitative Data}
# NOTE(review): the accompanying text says only journals with a policy are
# considered, but this chunk uses jpolicy_dedup (all unique journals) --
# confirm which is intended.
table(jpolicy_dedup[["qual_data"]])

# Number of unique journals per level of qualitative-data mention,
# one panel per discipline
qualdataPlot <- ggplot(
  jpolicy_dedup,
  aes(x = qual_data, group = "discipline")
) +
  geom_bar(aes(fill = factor(..x..)), stat = "count") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    family = "Open Sans",
    size = 3,
    vjust = 1
  ) +
  facet_wrap(~ discipline, nrow = 2) +
  scale_fill_manual(values = c("red", "dodgerblue", "green")) +
  guides(fill = FALSE) +
  labs(
    x = "Mentions of qualitative data",
    y = "Number of Journals",
    title = "Mention of qualitative data in policy by discipline"
  ) +
  theme(
    panel.grid = element_blank(),
    text = element_text(family = "Open Sans")
  )
print(qualdataPlot)
ggsave("Qualitative Data by Discipline.png", plot = qualdataPlot)
```

### Benefits
This calculates the figures in the "Recommendation" section under "Include the benefits of data sharing"

```{r benefits}
# Mention of benefits (rows) by policy source (columns) among unique journals
# with a policy
table(
  jpolicy_with_policy_dedup[["benefits"]],
  jpolicy_with_policy_dedup[["source"]]
)

```


